#!/usr/bin/env perl
use warnings;
use strict;
use File::Basename;
##This script is to estimate the mean LTR sequence identity in the genome
##The script also extracts coordinate information for intact LTRs and all LTRs to calculate the LAI score
##Users should provide the LTR_retriever *pass.list file and the RepeatMasker *out file as inputs
##Author: Shujun Ou (oushujun@msu.edu)
##First version: 11/23/2016
##Beta v2: 02/05/2018
##Beta v3: 02/19/2018

my $usage="
The LTR Assembly Index (LAI) is developed to evaluate the assembly continunity of repetitive sequences

Usage: ./LAI -genome genome.fa -intact intact.pass.list -all genome.out [options]
Options:
	-genome [file]	The genome file that is used to generate everything.
	-intact [file]	A list of intact LTR-RTs generated by LTR_retriever (genome.fa.pass.list).
	-all [file]	RepeatMasker annotation of all LTR sequences in the genome (genome.fa.out).
	-window [int]	Window size for LAI estimation. Default: 3000000 (3 Mb)
	-step [int]	Step size for the estimation window to move forward. Default: 300000 (300 Kb)
			Set step size = window size if prefer non-overlapping outputs.
	-q		Quick estimation of LTR identity (much faster for large genomes, may sacrifice ~0.5% of accuracy).
	-qq		No estimation of LTR identity, only output raw LAI for within species comparison (very quick).
	-mono [file]	This parameter is mainly for ployploid genomes. User provides a list of sequence names that represent a monoploid (1x).
			LAI will calculated only on these sequences if provided. So user can also specify sequence of interest for LAI calculation.
	-iden [0-100]	Mean LTR identity (%) in the monoploid (1x) genome. This parameter will inactivate de novo estimation (same speed to -qq).
	-totLTR [0-100]	Specify the total LTR sequence content (%) in the genome instead of estimating from the -all RepeatMasker file.
	-blast [path]	The path to the blastn program. If left unspecified, then blastn must be accessible via shell ENV.
	-t [number]	Number of threads to run blastn.
	-h		Display this help info.
\n";

my $window="3000000"; #3Mb/window
my $step="300000"; #300Kb/step
my $iden="NA"; #mean identity (%) of LTR sequences in the monoploid (1x) genome
my $iden_slope="2.8138"; #the correction slope for LTR sequence identity (age). Don't change this unless you are confident.
my $totLTR="NA"; #the total LTR sequence content (%) in the genome. Will be estimated from the -all RepeatMasker file if unspecified.
my $quick=0; #quick estimation of LTR identity (may sacrifice ~0.5% of accuracy)
my $qquick=0; #Obtain only raw LAI for within species comparison (very quick)
my $threads=4; #Number of threads to run blastn
my $blast=''; #path to the blastn and makeblastdb
my $mode="standard mode"; #the mode for LTR identity estimation
my $fuse=1; #1, limit LTR identity to [92, 96]; 0, no limitation of LTR identity
my $monoploid=''; #A list of sequence names from the same monoploid (1x) for ployploid genomes
my $genome='';
my $intact='';
my $all='';
my $version="beta3.2";

my $k=0;
foreach (@ARGV){
	$genome=$ARGV[$k+1] if /^-genome$/i;
	$intact=$ARGV[$k+1] if /^-intact$/i;
	$all=$ARGV[$k+1] if /^-all$/i;
	$window=$ARGV[$k+1] if /^-window$/i;
	$step=$ARGV[$k+1] if /^-step$/i;
	$quick=1 if /^-q$/i;
	$qquick=1 if /^-qq$/i;
	$monoploid=$ARGV[$k+1] if /^-mono$/i;
	$iden=$ARGV[$k+1] if /^-iden$/i;
	$iden_slope=$ARGV[$k+1] if /^-k_iden$/i;
	$totLTR=$ARGV[$k+1] if /^-totLTR$/i;
	$blast=$ARGV[$k+1] if /^-blast$/i;
	$threads=$ARGV[$k+1] if /^-t$/i;
	$fuse=0 if /^-unlock$/i;
	die $usage if /^-?-h|help$/i;
	$k++;
	}
$mode="quick mode" if $quick==1;
$mode="rapid mode" if $qquick==1;
die "Input file(s) not exist!\n$usage" unless -s "$genome" and -s "$intact" and -s "$all";
my $all_file = basename($all);
`ln -s $all $all_file` unless -e $all_file;
$all = $all_file;


print "
######################################
### LTR Assembly Index (LAI) $version ###
######################################\n
Developer: Shujun Ou

Please cite:

Ou S., Chen J. and Jiang N. (2018). Assessing genome assembly quality using the LTR Assembly Index (LAI). Nucleic Acids Res. gky730: https://doi.org/10.1093/nar/gky730

Parameters: @ARGV\n\n\n";

my $date=`date`;
chomp ($date);
print "$date\tDependency checking: ";
#check blastn
$blast=`which blastn 2>/dev/null` if $blast eq '';
$blast=~s/blastn\n?$//;
$blast="$blast/" unless $blast=~/\/$/;
die "blastn is not exist in the BLAST+ path $blast!\n" unless -X "${blast}blastn";
#check makeblastdb
$blast=`which makeblastdb 2>/dev/null` if $blast eq '';
$blast=~s/makeblastdb\n$//;
die "makeblastdb is not exist in the BLAST+ path $blast!\n" unless -X "${blast}makeblastdb";
print "Passed!\n";

#obtain the exact path for the program location
my $script_path = dirname(__FILE__);

my $rand=int(rand(1000000));

#extract sequence for targeted LAI calculation
$date=`date`;
chomp ($date);
if ($monoploid ne ''){
	print "$date\tUser specified sequence list is provided: $monoploid\n\t\t\t\tCalculation of LAI will be based only on these sequences.\n";
	`perl $script_path/bin/output_by_list.pl 1 $genome 1 $monoploid -FA > $genome.$monoploid.fa`;
	`for i in \`cat $monoploid\`; do cat $all | perl -snle 's/^\\s+//; my \$chr=(split)[4]; \$i=~s/^>//; print \$_ if \$chr eq \$i' -- -i=\$i; done > $all.$monoploid.out`;
	$genome = "$genome.$monoploid.fa";
	$all = "$all.$monoploid.out";
	} else {
	print "$date\tCalculation of LAI will be based on the whole genome.\n\t\t\t\tPlease use the -mono parameter if your genome is a recent ployploid, for high identity between homeologues will overcorrect raw LAI scores.\n";
	}

#estimate mean identity of LTR sequences in the genome
if ($qquick != 1 and $iden eq "NA"){
	$date=`date`;
	chomp ($date);
	print "$date\tEstimate the identity of LTR sequences in the genome: $mode\n";
	$iden=`perl $script_path/bin/Age_est.pl -RMout $all -genome $genome -blast $blast -t $threads -iden_cut 100 -q ` if $quick == 1;
	$iden=`perl $script_path/bin/Age_est.pl -RMout $all -genome $genome -blast $blast -t $threads -iden_cut 100 ` if $quick == 0;
	$iden = $1 if $iden=~/Mean_identity:([0-9.]+)/;
	}

#update info
$date=`date`;
chomp ($date);
print "$date\tThe identity of LTR sequences: $iden%\n";
if ($fuse==1 and $qquick==0){
	($iden=92,  print "\n\t\t\t\t【Warning】 The identity drops below the safe limit. Instead, identity of 92% will be used for LAI adjustment.\n\n") if $iden<92;
	($iden=96,  print "\n\t\t\t\t【Warning】 The identity jumps above the safe limit. Instead, identity of 96% will be used for LAI adjustment.\n\n") if $iden>96;
	} 
elsif ($fuse==0 and $qquick==0) {
	print "\n\t\t\t\t【Warning】 The identity drops below the safe limit under the unlock mode.\n\n" if $iden<92;
	print "\n\t\t\t\t【Warning】 The identity jumps above the safe limit under the unlock mode.\n\n" if $iden>96;
	}

print "$date\tCalculate LAI:\n";
#convert intact LTR information into bed format
`perl -nle 's/^\\s+//; my (\$id, \$age)=(split)[0,11]; \$age=4000000 if \$age eq \"NA\"; next unless \$id=~/:/; \$id=~s/\\.\\./\\t/g; \$id=~s/:/\\t/g; print "\$id\\t\$age"' $intact > $intact.bed`;

#get all LTR sequence annotated in the genome that are no shorter than 80bp
`awk '{if (\$6~/[0-9]/ && \$7-\$6+1>=80 && \$11~/LTR/) print \$5"\\t"\$6"\\t"\$7}' $all > $all.bed`;

#calculate the LAI score
`perl $script_path/bin/LAI_calc3.pl -intact $intact.bed -all $all.bed -genome $genome -window $window -step $step -iden $iden -k_iden $iden_slope -totLTR $totLTR > $all.LAI`;

`rm $intact.bed $all.bed`;

my $info=`grep whole_genome $all.LAI`;
$info=~s/^\s+//;
my ($int, $total)=(split /\s+/, $info)[3,4];
($int, $total)=($int*100, $total*100); #convert to %
if (($int<0.1 or $total<5) and $fuse==1){
	print "\n\t\t\t\t【Error】Intact LTR-RT content ($int%) is too low for accurate LAI calculation (min 0.1% required)\n" if $int<0.1;
	print "\t\t\t\t【Error】 Total LTR sequence content ($total%) is too low for accurate LAI calculation (min 5% required)\n" if $total<5;
	`rm $all.LAI`;
	print "\n\t\t\t\tSorry, LAI is not applicable on the current genome assembly.\n\n";
	} else {
	print "\n\t\t\t\t【Warning】Intact LTR-RT content ($int%) is too low for accurate LAI calculation (min 0.1% preferred)\n" if $int<0.1;
	print "\t\t\t\t【Warning】 Total LTR sequence content ($total%) is too low for accurate LAI calculation (min 5% preferred)\n" if $total<5;
	print "\n\t\t\t\t\t\tDone!\n\n";
	$date=`date`;
	chomp ($date);
	print "$date\tResult file: $all.LAI\n\n";
	print "\t\t\t\tYou may use either raw_LAI or LAI for intraspecific comparison\n\t\t\t\tbut please use ONLY LAI for interspecific comparison\n\n";
	}
